
global smartphone_data "${raw_data}/smartphone_contact_matrix"

cap program drop read_prison_data 
program define read_prison_data

	syntax , prison(string) [contact_matrix_path(string)]

	use "${intermediate_data}/CA_ZCTA_uniq_list.dta", clear

	ren ZCTA ZCTA_contact 

	* Form every pairwise combination of ZCTA-prison -> expand data to ZCTA-prison level
	cross using "${intermediate_data}/cdcr_population_w_ShapefileID.dta" 

	keep ZCTA_contact num_zip NAME institution_name COUNTY COUNTYFIPS prison_zip prison_ZCTA population_felons_201912 designed_capacity_201912 staffed_capacity_201912
	ren COUNTY prison_cty
	ren COUNTYFIPS prison_cty_FIPS

	* join to get all prisons covid case in a month (from 03/2020-12/2020, for 35 prisons) 
	joinby NAME using "${intermediate_data}/CDCR_covid_19_for_merged_monthly.dta"
	drop totalconfirmed_2020 totaldeaths_2020 distinctpatientstested_2020
	rename new_cases prison_new_cases 

	* merge ZCTA prison contact matrix 
	if "`contact_matrix_path'" == ""{
		merge 1:1 ZCTA_contact NAME month using "${smartphone_data}/prison_time_from_Jun-OctHome_ZCTA.dta", ///
			keepusing(hr_pri_fr_home prison_zip prison_ZCTA) keep(master match)	
	}
	else{
		merge 1:1 ZCTA_contact NAME month using "${smartphone_data}/`contact_matrix_path'", ///
			keepusing(hr_pri_fr_home prison_zip prison_ZCTA) keep(master match)			
	}

	// note: data w/ _merge == 2 includes other 5 facilities (hopistal/transitional) as well as contact in Jan, Feb 

	* fill in zeros for the contact matrix 
	foreach var of varlist hr_pri_fr_home{
		replace `var' = 0 if _merge == 1 & month >= 3 & month <= 10
	}
	drop _merge 

	* sort data
	sort NAME month ZCTA_contact 
	* only keep variables needed 
	keep NAME month ZCTA_contact num_zip hr_pri_fr_home prison_ZCTA

	* keep `prison' for event study
	keep if NAME == "`prison'"
				
	* merge ZCTA monthly covid case data
	ren ZCTA_contact ZCTA
	merge m:1 ZCTA month using "${intermediate_data}/CA_ZCTA_cases_p100k_bymonth.dta"
	* fill in zeros for covid cases
	replace max_cases_p100k = 0 if _merge == 1
	replace min_cases_p100k = 0 if _merge == 1
	drop if _merge == 2 // these include county observations in month 1-2
	drop _merge 
	ren max_cases_p100k ZCTA_max_cases_p100k
	ren min_cases_p100k ZCTA_min_cases_p100k

	* merge ZCTA month covid cases sum from prison
	* this is because variable ZCTA_max_cases_p100k / ZCTA_min_cases_p100k can include cases from prison 
	merge m:1 ZCTA month using "${intermediate_data}/CA_ZCTA_covid_from_prison_monthly.dta", ///
		keepusing(ZCTA_cases_fr_prison ZCTA_cases_fr_pri_p100k)
	replace ZCTA_cases_fr_prison = 0 if _merge == 1
	replace ZCTA_cases_fr_pri_p100k = 0 if _merge == 1
	drop _merge 

	* merge to get ZCTA population (ACS 17-21) 
	merge m:1 ZCTA using "${intermediate_data}/ZCTA_pop_race_acs_17_21.dta", ///
		keep(master match) keepusing(tot_population_acs_17_21) nogenerate
	ren ZCTA ZCTA_contact 
	ren tot_population_acs_17_21 ZCTA_population_acs_17_21

	* merge to get ZCTA demographic variables (ACS 15-19)
	rename ZCTA_contact zcta
	merge m:1 zcta using "${raw_data}/ZCTA_acs_15_19/ZCTA_demographics_acs_15_19.dta", keep(master match) nogenerate
	rename zcta ZCTA_contact
	
	* merge to get indicator of prison locating in the ZCTA
	rename ZCTA_contact ZCTA
	merge m:1 ZCTA using "${intermediate_data}/prison_ZCTA_list.dta", keep(master match)
	replace has_state_prison = 0 if _merge == 1 & missing(has_state_prison)
	drop _merge 
	rename ZCTA ZCTA_contact
	
	gen log_tot_pop_acs_15_19 = log(tot_pop_acs_15_19) // ACS 15-19 population 
	gen log_ZCTA_population = log(ZCTA_population_acs_17_21) // ACS 17-21 population 
	// main difference is that all ZCTA_population_acs_17_21 > 0 while 1 ZCTA has tot_pop_acs_15_19 == 0
	
	* adjust zipcode case to remove prison cases 
	gen ZCTA_max_cases_p100k_rmpri =  max(ZCTA_max_cases_p100k - ZCTA_cases_fr_pri_p100k, 0) 
	gen ZCTA_min_cases_p100k_rmpri =  max(ZCTA_min_cases_p100k - ZCTA_cases_fr_pri_p100k, 0)
	drop ZCTA_max_cases_p100k ZCTA_min_cases_p100k // drop original variables 

	* recover zipcode case count
	gen ZCTA_max_cases_rmpri = round(ZCTA_max_cases_p100k_rmpri * ZCTA_population_acs_17_21 / 100000, 1)
	gen ZCTA_min_cases_rmpri = round(ZCTA_min_cases_p100k_rmpri * ZCTA_population_acs_17_21 / 100000, 1)

	* tag ZCTA
	egen tag_ZCTA_contact = tag(ZCTA_contact)
	
	* calculate cumulative cases 
	sort ZCTA_contact month 
	gen cum_max_cases_p100k_rmpri = ZCTA_max_cases_p100k_rmpri if month == 3
	replace cum_max_cases_p100k_rmpri =  cum_max_cases_p100k_rmpri[_n-1] + ZCTA_max_cases_p100k_rmpri ///
		if ZCTA_contact == ZCTA_contact[_n-1] & month == month[_n-1] + 1

	* calculate contact/cases in may 
	forval i = 5/6{
		gen hr_pri_fr_home`i'_ = hr_pri_fr_home if month == `i'
		gegen hr_pri_fr_home`i' = max(hr_pri_fr_home`i'_), by(ZCTA_contact)
		drop hr_pri_fr_home`i'_
		
		gen ZCTA_max_cases_p100k_rmpri`i'_ = ZCTA_max_cases_p100k_rmpri if month == `i'
		gegen ZCTA_max_cases_p100k_rmpri`i' = max(ZCTA_max_cases_p100k_rmpri`i'_ ), by(ZCTA_contact)
		drop ZCTA_max_cases_p100k_rmpri`i'_ 
	}

	* calculate cubic polynomial for cases 
	gen ZCTA_max_cases_p100k_rmpri5_sq = ZCTA_max_cases_p100k_rmpri5 ^ 2
	gen ZCTA_max_cases_p100k_rmpri5_cu = ZCTA_max_cases_p100k_rmpri5 ^ 3

	gen cum_max_cases_p100k_rmpri5_ = cum_max_cases_p100k_rmpri if month == 5
	gegen cum_max_cases_p100k_rmpri5 = max(cum_max_cases_p100k_rmpri5_ ), by(ZCTA_contact)
	drop cum_max_cases_p100k_rmpri5_ 

	*define treatment group and first treated month for csdid command (June)
	gen treated = (hr_pri_fr_home6 > 0)
	gen first_treated = 6 * (hr_pri_fr_home6 > 0) // month that ZCTA is first treated

	*calculate average of the per-period contact (hr_pri_fr_home) 
	gen byte pre = (month <= 5)
	gegen preavg_hr_pri_fr_home_ = mean(hr_pri_fr_home) if pre == 1, by(ZCTA_contact)
	gegen preavg_hr_pri_fr_home = max(preavg_hr_pri_fr_home), by(ZCTA_contact)
	drop preavg_hr_pri_fr_home_ 

	*define treatment group and first treated month based on per-period average contact
	gen treated_pre = (preavg_hr_pri_fr_home > 0)
	gen first_treated_pre = 6 * (preavg_hr_pri_fr_home > 0) // month that ZCTA is first treated

	if "`prison'" == "SAN QUENTIN STATE PRISON (SQ)"{
		local name = "SQ"
	}
	if "`prison'" == "CALIFORNIA STATE PRISON, CORCORAN (COR)"{
		local name = "COR"
	}
	
	label variable pct_nh_blk_alone_acs_15_19 "\% Black (ACS 15-19)"
	label variable pct_nh_white_alone_acs_15_19  "\% White (ACS 15-19)"
	label variable pct_hisp_latino_acs_15_19  "\% Hispanic (ACS 15-19)"
	label variable pct_college_acs_15_19  "\% College (ACS 15-19)"
	label variable pct_pop_65plus_acs_15_19  "\% Age Above 65 (ACS 15-19)"
	label variable med_hhc_inc_acs_15_19  "Med HH Cncome (ACS 15-19)"
	label variable log_tot_pop_acs_15_19 "Log Population (ACS 15-19)"
	label variable ZCTA_max_cases_p100k_rmpri5 "Zip Code Cases per 100k (May)"
	label variable cum_max_cases_p100k_rmpri5 "Cumulative Zip Code Cases per 100k (May)"
	
	drop if month >= 11 
	
	***************************************
	* Merge to get LODES data
	***************************************
	rename ZCTA_contact ZCTA 
	merge m:1 ZCTA NAME using "${intermediate_data}/LODES_2020_SQSPBG_to_homeZCTA.dta", ///
		keepusing(zcta_s000) keep(master match) nogenerate
	rename ZCTA ZCTA_contact 
	
	*define treatment group and first treated month for csdid command (June)
	gen treated_lodes = (zcta_s000 > 0)
	gen first_treated_lodes = 6 * (zcta_s000 > 0) // month that ZCTA is first treated

	gen cum_cases5_X_pct_white = cum_max_cases_p100k_rmpri5 * pct_nh_white_alone_acs_15_19
	gen cases_5_X_pct_white = ZCTA_max_cases_p100k_rmpri5 * pct_nh_white_alone_acs_15_19
	gen cases_sq_5_X_pct_white = ZCTA_max_cases_p100k_rmpri5_sq * pct_nh_white_alone_acs_15_19
	gen cases_cu_5_X_pct_white = ZCTA_max_cases_p100k_rmpri5_cu * pct_nh_white_alone_acs_15_19

	gen cum_cases5_X_inc = cum_max_cases_p100k_rmpri5 * med_hhc_inc_acs_15_19 
	gen cases_5_X_inc = ZCTA_max_cases_p100k_rmpri5 * med_hhc_inc_acs_15_19 
	gen cases_sq_5_X_inc = ZCTA_max_cases_p100k_rmpri5_sq * med_hhc_inc_acs_15_19 
	gen cases_cu_5_X_inc = ZCTA_max_cases_p100k_rmpri5_cu * med_hhc_inc_acs_15_19 

	* define treated group as those > `i' half-hour (rather than half hour)
	forval i = 1/5{
		gen first_treated_`i'HH = 6 * (hr_pri_fr_home6 > 0.5 * `i')	// HH means half-hour
		gen treated_`i'HH = (hr_pri_fr_home6 > 0.5 * `i')
	}
	gen first_treated_10HH = 6 * (hr_pri_fr_home6 > 5)
	gen first_treated_20HH = 6 * (hr_pri_fr_home6 > 10)

end 


